## Loading required package: rpart
## X Standing Username Country
## Min. : 1 Min. : 1 Length:39769 Length:39769
## 1st Qu.: 9943 1st Qu.: 9943 Class :character Class :character
## Median :19885 Median :19885 Mode :character Mode :character
## Mean :19885 Mean :19885
## 3rd Qu.:29827 3rd Qu.:29827
## Max. :39769 Max. :39769
## Wins Games.Played Winrate APM
## Min. : 0.0 Min. : 10.0 Min. :0.0000 Min. : 1.05
## 1st Qu.: 31.0 1st Qu.: 63.0 1st Qu.:0.4844 1st Qu.: 15.11
## Median : 83.0 Median : 159.0 Median :0.5087 Median : 23.10
## Mean : 158.7 Mean : 311.2 Mean :0.4951 Mean : 30.56
## 3rd Qu.: 202.0 3rd Qu.: 389.0 3rd Qu.:0.5327 3rd Qu.: 38.13
## Max. :4001.0 Max. :8142.0 Max. :1.0000 Max. :227.68
## PPS VS Glicko.Rating Rating.Deviation
## Min. :0.300 Min. : 1.75 Min. : 265 Min. : 60.00
## 1st Qu.:0.940 1st Qu.: 32.44 1st Qu.:1168 1st Qu.: 62.00
## Median :1.170 Median : 49.86 Median :1479 Median : 72.00
## Mean :1.259 Mean : 64.76 Mean :1496 Mean : 74.94
## 3rd Qu.:1.480 3rd Qu.: 81.44 3rd Qu.:1774 3rd Qu.: 86.00
## Max. :4.270 Max. :438.21 Max. :4276 Max. :100.00
## Tetra.Rating Rank Active.This.Week Supporter.Status.
## Min. : 11.47 Length:39769 Length:39769 Length:39769
## 1st Qu.: 4531.12 Class :character Class :character Class :character
## Median : 9509.57 Mode :character Mode :character Mode :character
## Mean : 9862.82
## 3rd Qu.:14693.09
## Max. :24752.28
## RankColour
## Length:39769
## Class :character
## Mode :character
##
##
##
## 'data.frame': 39769 obs. of 17 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Standing : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Username : chr "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
## $ Country : chr "Japan" "United States" "Korea, Republic of" "Japan" ...
## $ Wins : int 1026 347 511 358 320 270 179 323 164 356 ...
## $ Games.Played : int 1233 394 670 437 454 358 229 463 206 533 ...
## $ Winrate : num 0.832 0.881 0.763 0.819 0.705 ...
## $ APM : num 228 213 195 203 191 ...
## $ PPS : num 4.27 3.92 3.39 3.83 3.42 3.84 3.44 3.56 3.04 3.84 ...
## $ VS : num 438 421 389 394 391 ...
## $ Glicko.Rating : int 4276 4026 3963 3944 3931 3899 3862 3854 3853 3767 ...
## $ Rating.Deviation : int 85 71 72 76 72 81 67 65 86 68 ...
## $ Tetra.Rating : num 24752 24641 24601 24591 24579 ...
## $ Rank : chr "X+" "X+" "X+" "X+" ...
## $ Active.This.Week : chr "Yes" "Yes" "Yes" "No" ...
## $ Supporter.Status.: chr "Yes" "Yes" "No" "Yes" ...
## $ RankColour : chr "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
## Preparing the data
# Convert the raw columns to the types used by the rest of the analysis.

# Country: shorten the "X, Republic of" style names, then store as a factor.
country_renames <- c(
  "Korea, Republic of" = "Republic of Korea",
  "Venezuela, Bolivarian Republic of" = "Republic of Venezuela",
  "Macedonia, the former Yugoslav Republic of" = "Republic of Macedonia"
)
country <- data$Country
for (old_name in names(country_renames)) {
  country <- sub(old_name, country_renames[[old_name]], country)
}
data$Country <- as.factor(country)

# Rank: factor with the levels listed from "D" up to "X+".
rank_levels <- c("D", "D+", "C-", "C", "C+", "B-", "B", "B+", "A-", "A", "A+",
                 "S-", "S", "S+", "SS", "U", "X", "X+")
data$Rank <- factor(data$Rank, levels = rank_levels)

# "Yes"/"No" columns become numeric 1/0 indicators.
data$Active.This.Week <- ifelse(data$Active.This.Week == "Yes", 1, 0)
data$Supporter.Status. <- ifelse(data$Supporter.Status. == "Yes", 1, 0)

# Counts are stored as doubles, the user name as plain text.
data$Wins <- as.numeric(data$Wins)
data$Games.Played <- as.numeric(data$Games.Played)
data$Username <- as.character(data$Username)

# The index column X duplicates Standing, so drop it.
data$X <- NULL
str(data)
## 'data.frame': 39769 obs. of 16 variables:
## $ Standing : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Username : chr "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
## $ Country : Factor w/ 225 levels "","Afghanistan",..: 103 212 162 103 157 145 162 212 89 95 ...
## $ Wins : num 1026 347 511 358 320 ...
## $ Games.Played : num 1233 394 670 437 454 ...
## $ Winrate : num 0.832 0.881 0.763 0.819 0.705 ...
## $ APM : num 228 213 195 203 191 ...
## $ PPS : num 4.27 3.92 3.39 3.83 3.42 3.84 3.44 3.56 3.04 3.84 ...
## $ VS : num 438 421 389 394 391 ...
## $ Glicko.Rating : int 4276 4026 3963 3944 3931 3899 3862 3854 3853 3767 ...
## $ Rating.Deviation : int 85 71 72 76 72 81 67 65 86 68 ...
## $ Tetra.Rating : num 24752 24641 24601 24591 24579 ...
## $ Rank : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 18 18 18 18 18 18 ...
## $ Active.This.Week : num 1 1 1 0 1 1 1 1 1 1 ...
## $ Supporter.Status.: num 1 1 0 1 1 1 0 1 1 0 ...
## $ RankColour : chr "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
# Assign every row of `data` to one of `num_of_splits` folds at random.
# The fold labels 1..num_of_splits are repeated often enough to cover all rows
# and then drawn without replacement, which keeps the fold sizes nearly equal.
num_of_splits <- 10
fold_labels <- rep(seq_len(num_of_splits), ceiling(nrow(data) / num_of_splits))
splits <- sample(fold_labels, nrow(data))
## 1 2 3 4 5 6 7 8 9 10
## 3977 3977 3977 3977 3977 3977 3976 3977 3977 3977
## int [1:39769] 7 9 8 4 4 6 3 9 10 10 ...
# Fold 1 is held out as the test set; all other folds form the training set.
in_test_fold <- splits == 1
test <- data[in_test_fold, ]
train <- data[!in_test_fold, ]
#nrow(train) + nrow(test)
#nrow(test)
#dynamically
#testdata(7)
#=> train
#=>test
# Min-max normalization: rescale every numeric column to the [0, 1] range.
# Work on a copy so the raw values in `data` stay available.
data_normalized <- data
# Columns are selected by name rather than by position so the loop keeps
# working if the column order changes.
columns_to_normalize <- c("Standing", "Wins", "Games.Played", "Winrate", "APM",
                          "PPS", "VS", "Glicko.Rating", "Rating.Deviation",
                          "Tetra.Rating")
for (column in columns_to_normalize) {
  column_range <- range(data_normalized[, column])
  # scale() returns a one-column matrix, which is why str() reports these
  # columns as num [1:n, 1].
  data_normalized[, column] <- scale(
    data_normalized[, column],
    center = column_range[1],
    scale = column_range[2] - column_range[1])
}
str(data_normalized, give.attr = FALSE)
## 'data.frame': 39769 obs. of 16 variables:
## $ Standing : num [1:39769, 1] 0 0.0000251 0.0000503 0.0000754 0.0001006 ...
## $ Username : chr "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
## $ Country : Factor w/ 225 levels "","Afghanistan",..: 103 212 162 103 157 145 162 212 89 95 ...
## $ Wins : num [1:39769, 1] 0.2564 0.0867 0.1277 0.0895 0.08 ...
## $ Games.Played : num [1:39769, 1] 0.1504 0.0472 0.0812 0.0525 0.0546 ...
## $ Winrate : num [1:39769, 1] 0.832 0.881 0.763 0.819 0.705 ...
## $ APM : num [1:39769, 1] 1 0.937 0.854 0.889 0.838 ...
## $ PPS : num [1:39769, 1] 1 0.912 0.778 0.889 0.786 ...
## $ VS : num [1:39769, 1] 1 0.96 0.887 0.898 0.892 ...
## $ Glicko.Rating : num [1:39769, 1] 1 0.938 0.922 0.917 0.914 ...
## $ Rating.Deviation : num [1:39769, 1] 0.625 0.275 0.3 0.4 0.3 0.525 0.175 0.125 0.65 0.2 ...
## $ Tetra.Rating : num [1:39769, 1] 1 0.995 0.994 0.993 0.993 ...
## $ Rank : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 18 18 18 18 18 18 ...
## $ Active.This.Week : num 1 1 1 0 1 1 1 1 1 1 ...
## $ Supporter.Status.: num 1 1 0 1 1 1 0 1 1 0 ...
## $ RankColour : chr "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
#handling flags
# Turn the Rank factor into one 0/1 indicator column per level. The columns are
# named flag<Level>Rank, with "+" spelled "Plus" and "-" spelled "Minus" so the
# names are syntactically valid (e.g. "D+" -> flagDPlusRank).
for (rank_level in levels(data_normalized$Rank)) {
  varname <- sprintf("flag%sRank", rank_level)
  varname <- sub("+", "Plus", varname, fixed = TRUE) # fixed: treat '+' literally
  varname <- sub("-", "Minus", varname, fixed = TRUE)
  # %in% never yields NA, so rows with a missing Rank get 0 in every flag.
  data_normalized[[varname]] <- as.numeric(data_normalized$Rank %in% rank_level)
}
# Remove the original Rank variable and one flag (D serves as the reference
# level). The column is called flagDRank; the earlier spelling "flagDrank" did
# not match any column, so nothing was dropped.
data_normalized$Rank <- NULL
data_normalized$flagDRank <- NULL
str(data_normalized, give.attr = FALSE)
# NOTE: the recorded output below was produced before the flagDRank fix; it
# still lists flagDRank and therefore reports 33 instead of 32 variables.
## 'data.frame': 39769 obs. of 33 variables:
## $ Standing : num [1:39769, 1] 0 0.0000251 0.0000503 0.0000754 0.0001006 ...
## $ Username : chr "5HAN" "CABOOZLED_PIE" "TURTLE" "SYAKEGOHAN" ...
## $ Country : Factor w/ 225 levels "","Afghanistan",..: 103 212 162 103 157 145 162 212 89 95 ...
## $ Wins : num [1:39769, 1] 0.2564 0.0867 0.1277 0.0895 0.08 ...
## $ Games.Played : num [1:39769, 1] 0.1504 0.0472 0.0812 0.0525 0.0546 ...
## $ Winrate : num [1:39769, 1] 0.832 0.881 0.763 0.819 0.705 ...
## $ APM : num [1:39769, 1] 1 0.937 0.854 0.889 0.838 ...
## $ PPS : num [1:39769, 1] 1 0.912 0.778 0.889 0.786 ...
## $ VS : num [1:39769, 1] 1 0.96 0.887 0.898 0.892 ...
## $ Glicko.Rating : num [1:39769, 1] 1 0.938 0.922 0.917 0.914 ...
## $ Rating.Deviation : num [1:39769, 1] 0.625 0.275 0.3 0.4 0.3 0.525 0.175 0.125 0.65 0.2 ...
## $ Tetra.Rating : num [1:39769, 1] 1 0.995 0.994 0.993 0.993 ...
## $ Active.This.Week : num 1 1 1 0 1 1 1 1 1 1 ...
## $ Supporter.Status.: num 1 1 0 1 1 1 0 1 1 0 ...
## $ RankColour : chr "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
## $ flagDRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagDPlusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagCMinusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagCRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagCPlusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagBMinusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagBRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagBPlusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagAMinusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagARank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagAPlusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagSMinusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagSRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagSPlusRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagSSRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagURank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagXRank : num 0 0 0 0 0 0 0 0 0 0 ...
## $ flagXPlusRank : num 1 1 1 1 1 1 1 1 1 1 ...
#make training set with higher proportion of supporters (50/50 split)
# NOTE(review): rows are drawn from the full `data`, so rows of the held-out
# `test` fold can end up in this training set -- confirm whether `train` was
# intended as the source.
allsupporters <- data[data$Supporter.Status. == 1, ]
allnonsupporters <- data[data$Supporter.Status. == 0, ]
# Keep every supporter and add an equally sized random sample of
# non-supporters.
train_balanced <- rbind(
  allsupporters,
  allnonsupporters[sample.int(nrow(allnonsupporters), nrow(allsupporters)), ]
)
#shuffle it
# The permutation must be over the rows of train_balanced itself; using
# nrow(train) would index past the end and create NA rows.
train_balanced <- train_balanced[sample.int(nrow(train_balanced)), ]
# Z-score standardization
# Add z-score standardized copies (suffix ".s") of selected numeric columns:
# (value - mean) / standard deviation.
#standardization of numeric variables, for decision trees is not necessary
data_standardized <- data
for (column in c("Wins", "Games.Played", "Tetra.Rating")) {
  values <- data[[column]]
  data_standardized[[paste0(column, ".s")]] <- (values - mean(values)) / sd(values)
}
# Auto-prints the complete data frame.
data_standardized
# Tree data
#data_without_support = data
#test_without_support = test
#train_without_support = train
#drop support
#data_without_support$Supporter.Status. = NULL
#test_without_support$Supporter.Status. = NULL
#train_without_support$Supporter.Status. = NULL

# Build the data sets for the decision trees: the same rows as train, test and
# train_balanced, but without the Username and Country columns.
tree_excluded_columns <- c("Username", "Country")
train_without_username <-
  train[setdiff(names(train), tree_excluded_columns)]
test_without_username <-
  test[setdiff(names(test), tree_excluded_columns)]
train_balanced_without_username <-
  train_balanced[setdiff(names(train_balanced), tree_excluded_columns)]
#train_without_username$Standing = NULL
# Fix the RNG state before fitting; this call had been swallowed by the
# comment line above and never ran.
set.seed(1)
#cartfittrain = rpart(Supporter.Status.~., dat=train_without_username, method="class", control=rpart.control(minsplit=4, cp=0.0015))
# Classification tree for supporter status, fitted on the balanced training
# set.
cartfittrain <- rpart(
  Supporter.Status. ~ .,
  data = train_balanced_without_username,
  method = "class",
  control = rpart.control(minsplit = 4, cp = 0.003)
)
#cartfittrain = rpart(Supporter.Status.~ Glicko.Rating+Country, dat=train_without_username, method="class")
rpart.plot(cartfittrain, type = 2)
# Second tree: only training rows with Standing below 400, with a larger
# complexity parameter.
cartfittrain2 <- rpart(
  Supporter.Status. ~ .,
  data = train_without_username[train_without_username$Standing < 400, ],
  method = "class",
  control = rpart.control(minsplit = 4, cp = 0.03)
)
rpart.plot(cartfittrain2, type = 2)
## 'data.frame': 35792 obs. of 14 variables:
## $ Standing : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Wins : num 1026 347 511 358 320 ...
## $ Games.Played : num 1233 394 670 437 454 ...
## $ Winrate : num 0.832 0.881 0.763 0.819 0.705 ...
## $ APM : num 228 213 195 203 191 ...
## $ PPS : num 4.27 3.92 3.39 3.83 3.42 3.84 3.44 3.56 3.04 3.84 ...
## $ VS : num 438 421 389 394 391 ...
## $ Glicko.Rating : int 4276 4026 3963 3944 3931 3899 3862 3854 3853 3767 ...
## $ Rating.Deviation : int 85 71 72 76 72 81 67 65 86 68 ...
## $ Tetra.Rating : num 24752 24641 24601 24591 24579 ...
## $ Rank : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 18 18 18 18 18 18 ...
## $ Active.This.Week : num 1 1 1 0 1 1 1 1 1 1 ...
## $ Supporter.Status.: num 1 1 0 1 1 1 0 1 1 0 ...
## $ RankColour : chr "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...
# Same tree specification as cartfittrain2, fitted on the test rows with
# Standing below 400. The row mask must come from the test frame itself; a mask
# built from the training frame has a different length and selects the wrong
# rows.
cartfittest2 <- rpart(
  Supporter.Status. ~ .,
  data = test_without_username[test_without_username$Standing < 400, ],
  method = "class",
  control = rpart.control(minsplit = 4, cp = 0.03)
)
rpart.plot(cartfittest2, type = 2)
## 'data.frame': 3977 obs. of 14 variables:
## $ Standing : int 12 24 35 38 96 97 98 102 131 142 ...
## $ Wins : num 152 263 124 736 181 695 680 98 783 183 ...
## $ Games.Played : num 205 397 166 1354 275 ...
## $ Winrate : num 0.742 0.662 0.747 0.544 0.658 ...
## $ APM : num 191 173 172 153 139 ...
## $ PPS : num 3.59 3.54 3.16 3.49 2.45 2.62 2.5 3.18 2.78 2.82 ...
## $ VS : num 374 352 334 310 298 ...
## $ Glicko.Rating : int 3722 3599 3523 3467 3231 3218 3218 3215 3103 3060 ...
## $ Rating.Deviation : int 64 69 88 63 76 64 64 93 65 69 ...
## $ Tetra.Rating : num 24424 24297 24192 24146 23767 ...
## $ Rank : Factor w/ 18 levels "D","D+","C-",..: 18 18 18 18 17 17 17 17 17 17 ...
## $ Active.This.Week : num 1 0 0 1 1 1 1 0 1 0 ...
## $ Supporter.Status.: num 0 1 0 0 0 1 0 0 0 0 ...
## $ RankColour : chr "#A763EA" "#A763EA" "#A763EA" "#A763EA" ...